So, in Part 1 of the workshop, we worked through our raw data to make it fit for downstream analysis. This type of data is called as processed/clean data.
Loading packages:
library(tidyverse)Loading our processed data from Part 1:
eNorm <- read.delim("data/eNorm.txt", sep = "\t")
eNorm <- eNorm %>%
column_to_rownames(var = "gene")
pDat <- read.delim("data/GSE157103_formatted_pDat.txt", sep = "\t")
pDat <- pDat %>%
column_to_rownames(var = "ID")We’ll start with principle component analysis (PCA) - a dimensionality reduction method that accounts for sample variation while maximizing variance.
## transforming eNorm values to log2(x)+1
e_log2 <- log2(eNorm + 1)
## transposing our log(x+1) transposed data frame, so that the columns
## become the rows, and the rows become columns. As we want to check
## the variance driven by the genes, and not the samples, we transpose
## the dataframe to have the rows as the samples, and the columns as the genes,
## as the PCA function performs column-wise applications, not row-wise.
t_log2 <- as.data.frame(t(e_log2))
## As our data has already been normalized, we don't want to scale it further.
## We do however, want to centre it - meaning standardizing the upper and lower
## limits of the distribution of our values
pca <- prcomp(t_log2, scale = FALSE, center = TRUE)
# print(pca)
# summary(pca)
# screeplot(pca)
library("factoextra", quietly = TRUE)## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
eig.val<-get_eigenvalue(pca)
head(eig.val)## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1685.5669 34.022321 34.02232
## Dim.2 1013.4825 20.456635 54.47896
## Dim.3 284.6378 5.745272 60.22423
## Dim.4 266.1820 5.372750 65.59698
## Dim.5 173.9535 3.511165 69.10814
## Dim.6 125.3862 2.530857 71.63900
## dataframe with all PCs, their variance, and cumulative variance of all PCs
summary <- data.frame(PC = 1:126, var_explained = (pca$sdev)^2 / sum((pca$sdev)^2),
cumulative = cumsum(pca$sdev^2 / sum(pca$sdev^2))
)
summary <- summary %>%
mutate(cumulative_perc = cumulative*100)
## we only consider the first 30 PCs
summary <- summary[1:30,]
## different ways to represent the same data
summary %>%
ggplot(aes(x = sort(as.factor(PC)), y = var_explained)) +
geom_bar(stat = "identity", fill = "forest green") +
geom_text(aes(label = round(var_explained, digits = 2), vjust = -0.8), size = 2) +
theme_minimal() +
labs(title = "Variance Explained by each PC") summary %>%
ggplot(aes(x = sort(as.factor(PC)), y = var_explained))+
geom_point(colour = "forest green") +
geom_line(group = "PC", colour = "forest green") +
theme_minimal() +
labs(title = "Variance Explained by each PC")## or easily used by calling function in the "factoextra" package
fviz_eig(pca, col.var="blue")summary %>%
ggplot(aes(x = sort(as.factor(PC)), y = cumulative_perc))+
geom_point(colour = "forest green") +
geom_line(group = "PC", colour = "forest green") +
theme_minimal() +
labs(title = "Cumulative Proportion of Variation") ## separating the PCA values into its own separate df
scores <- as.data.frame(pca$x)
## take the first 30
scores <- scores[c(1:30)]
head(scores)## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## C1 -2.873577 31.513465 4.581241 -34.747719 15.702070 -29.361896 8.508419
## C2 1.069681 21.246050 -10.968403 8.339987 -3.260639 -17.241597 -3.523015
## C3 -21.265732 -16.398176 11.405154 -49.406015 -10.698952 -6.128537 -1.064602
## C4 8.360660 48.599969 9.782527 12.568988 2.640562 -2.135567 -3.014367
## C5 -52.264090 7.886006 7.748767 -25.925634 9.453192 15.599042 -6.826804
## C6 13.613215 29.320890 -6.705883 15.198558 -7.176141 1.188698 -10.093523
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## C1 -10.146967 -1.7393485 8.528270 -2.4318774 1.7644941 -5.1124617 1.642345
## C2 -3.660366 -0.8253942 -1.797925 -5.2304072 -0.7391959 -15.2686948 -2.139551
## C3 -5.071375 18.4334570 14.031947 -3.2970120 -6.8373987 -0.4272142 1.500038
## C4 -2.971492 -5.1869985 1.106609 0.6095576 -5.0401862 -6.4507389 2.681354
## C5 2.492451 -2.4695017 -19.801383 -5.0695824 -6.2003436 -2.4090045 11.883342
## C6 -3.208975 2.7844656 -6.127613 -1.0578476 0.4531912 -10.9567302 3.506287
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## C1 -8.875898 9.958008 -2.257577 0.0760982 5.0013025 -3.701882 11.100054
## C2 -1.673642 -3.371377 3.338661 -4.5877960 4.8236714 -2.603036 4.839628
## C3 1.668607 8.304741 -5.059460 6.8785575 9.3845361 3.353635 5.716499
## C4 -9.781740 3.395398 2.955598 3.5514763 4.7594520 0.723566 2.178126
## C5 3.812600 10.180299 19.058720 -4.1820518 1.2522601 6.604281 -12.063440
## C6 1.709447 -2.951148 2.932438 -2.6384673 0.2379606 -2.517254 5.712442
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## C1 -1.1830862 -4.3701516 -6.0269453 2.487731 -3.9176775 7.2084391 -1.931781
## C2 -0.1437366 -4.0622984 0.6567662 1.469711 -3.0097441 1.4143064 -1.387003
## C3 7.6056574 1.1696635 -1.0229573 1.078533 1.9816533 -0.2157957 1.227088
## C4 -5.2577541 8.6100230 1.0761861 -0.495693 0.2191656 -0.8076055 2.522992
## C5 4.2031459 4.3432118 3.7112434 3.547634 3.0213863 4.0804927 -4.757769
## C6 2.2821878 -0.9523062 -1.6960234 2.700263 3.7529861 2.8236221 -2.476955
## PC29 PC30
## C1 7.1364450 -1.27354936
## C2 -2.0213152 -1.19972570
## C3 7.3628155 12.38335042
## C4 -3.9418235 -0.03496177
## C5 0.7855329 0.31440811
## C6 0.1467464 0.12057683
## making a metadata data.frame containing all sample information data
mDat <- cbind(pDat, scores)Now that we have our PC scores, we’ll estimate which of our variables are the ones driving that variation in our data
var <- get_pca_var(pca)
var## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
library("corrplot", quietly = TRUE)## corrplot 0.92 loaded
## contribution: first 20 genes, first 10 PCs
corrplot(var$contrib[1:20, 1:10], is.corr=FALSE)## Cos2 is called square cosine (squared coordinates) and corresponds
## to the quality of representation of variables
corrplot(var$cos2[1:20, 1:10], is.corr=FALSE)## NOT RUN
# fviz_pca_var(pca,
# col.var = "cos2", # Color by the quality of representation
# gradient.cols = c("darkorchid4", "gold", "darkorange"),
# repel = TRUE
# )
## top10 contributions of variables to PC1
a<-fviz_contrib(pca, choice = "var", axes = 1, top=10)
## top 10 contributions of variables to PC2
b<-fviz_contrib(pca, choice = "var", axes = 2, top=10)
library("gridExtra", quietly = TRUE)##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(a,b, ncol=2, top='Top10 contribution of the variables to the first two PCs')## The results, for individuals (athletes) will be extracted using the
## function get_pca_ind(). Similarly to variables, it provides a list
## of matrices containing all the results for the individuals (coordinates,
## correlation between individuals and axes, squared cosine, and contributions).
ind <- get_pca_ind(pca)
ind## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
fviz_pca_ind(pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("darkorchid4", "gold", "darkorange"),
repel = TRUE
)## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Total contribution on PC1 and PC2
fviz_contrib(pca, choice = "ind", axes = 1:2)We’ll now plot the first 2 PCs with the variables that seem to be contributing to the most variance in the data.
mDat %>%
ggplot(aes(x = PC1, y = PC2, colour = COVID)) +
geom_point(size = 3) +
coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
labs( x = "Principle Component 1", y = "Principle Component 2", title = "COVID: PC1 vs PC2") +
scale_colour_manual(values = c("grey", "orange")) +
theme_minimal() mDat %>%
ggplot(aes(x = PC1, y = PC2, colour = ICU)) +
geom_point(size = 3) +
coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
labs( x = "Principle Component 1", y = "Principle Component 2", title = "ICU: PC1 vs PC2") +
scale_colour_manual(values = c("grey", "blue")) +
theme_minimal() mDat %>%
ggplot(aes(x = PC1, y = PC2, colour = Mechanical_Ventilation)) +
geom_point(size = 3) +
coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
labs( x = "Principle Component 1", y = "Principle Component 2", title = "Mechanical Ventilation: PC1 vs PC2") +
scale_colour_manual(values = c("grey", "purple")) +
theme_minimal()mDat %>%
# mutate(AP_score = case_when(
# APACHEII_Score <= 10 ~ "less_than_10",
# between(APACHEII_Score, 11, 20) ~ "eleven_to_20",
# between(APACHEII_Score, 21, 30) ~ "twentyone_to_30",
# between(APACHEII_Score, 31, 40) ~ "thirtyone_to_40",
# APACHEII_Score > 40 ~ "more_than_40")) %>%
ggplot(aes(x = PC1, y = PC2, colour = APACHEII_Score)) +
geom_point(size = 3) +
coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
labs( x = "Principle Component 1", y = "Principle Component 2", title = "APACHEII_Score", subtitle = "Score of disease-severity measured upon admittance to ICU") +
theme_minimal() ## Compare PC2 and PC3
mDat %>%
ggplot(aes(x = PC1, y = PC2, colour = ICU)) +
geom_point(size = 3) +
coord_cartesian(ylim = c(-130, 130), xlim = c(-130, 130)) +
labs( x = "Principle Component 1", y = "Principle Component 2", title = "ICU: PC1 vs PC2") +
scale_colour_manual(values = c("grey", "blue")) +
theme_minimal() mDat %>%
ggplot(aes(x = PC2, y = PC3, colour = ICU)) +
geom_point(size = 3) +
coord_cartesian(ylim = c(-100, 100), xlim = c(-100, 100)) +
labs( x = "Principle Component 2", y = "Principle Component 3", title = "ICU: PC2 vs PC3") +
scale_colour_manual(values = c("grey", "blue")) +
theme_minimal() Okay, now moving into DE analysis: we’re going to use the limma package, rather than the more popular DESeq2 or edgeR packages. There’s broadly 3 steps to pulling out DE genes:
library(limma)
## step 1
mm_covid <- model.matrix(~COVID, pDat)
## always better to use an intercept, as the starting value is not forced to zero
all(rownames(pDat) == colnames(eNorm))## [1] TRUE
## step 2
efit_COVID <- lmFit(eNorm, mm_covid)
## step 3
efit_COVID <- efit_COVID %>%
eBayes()
topTable(efit_COVID, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")## logFC AveExpr t P.Value adj.P.Val B
## GBGT1 -14.346965 13.119938 -8.608852 2.704926e-14 3.208042e-10 21.74699
## MCM6 13.006445 20.821294 8.119125 3.863965e-13 1.759204e-09 19.24480
## SLC12A9 -95.036340 100.648629 -8.092907 4.449926e-13 1.759204e-09 19.11189
## SLC15A3 -59.232204 68.814556 -7.998129 7.405414e-13 2.195705e-09 18.63245
## MAPK8IP3 -18.433847 29.398845 -7.874219 1.437384e-12 3.409474e-09 18.00811
## CDC7 3.365151 5.656738 7.711515 3.417089e-12 6.754446e-09 17.19281
## AIF1 -323.686454 472.519567 -7.566778 7.346232e-12 1.244662e-08 16.47216
## CLSPN 2.952676 4.133007 7.437656 1.448122e-11 2.146841e-08 15.83317
## PITPNM1 -16.397367 24.699942 -7.409046 1.682170e-11 2.216727e-08 15.69211
## TWF2 -84.289346 100.564424 -7.327147 2.580083e-11 3.059979e-08 15.28940
topTable(efit_COVID, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "logFC")## logFC AveExpr t P.Value adj.P.Val B
## S100A9 -10554.113 22372.842 -2.567381 1.142823e-02 2.912609e-02 -3.14921169
## HBA2 -4627.381 2670.516 -5.072714 1.385177e-06 3.487942e-05 5.06699674
## ACTB -4416.516 7242.785 -6.023213 1.782930e-08 1.639190e-06 9.14052918
## FTL -4187.909 6897.358 -5.544648 1.679337e-07 7.572980e-06 7.03827981
## IFITM2 -2848.844 5357.986 -3.659910 3.710775e-04 1.980639e-03 -0.09398016
## RNA28SN1 -2740.657 2785.014 -5.705683 7.982667e-08 4.733722e-06 7.73475274
## RNA28SN4 -2351.735 1437.686 -7.147384 6.556764e-11 4.860201e-08 14.41135701
## EEF1A1 -2146.417 8071.037 -3.468840 7.182910e-04 3.236676e-03 -0.69295220
## FCGR3B 2140.546 6069.773 2.383893 1.863926e-02 4.307319e-02 -3.56958200
## HBA1 -2103.973 1667.291 -3.847665 1.893569e-04 1.225359e-03 0.51944745
## google GBGT1, S100A9 and COVID - what do we find?We know from our PCA that age doesn’t seem to contribute to the variation observed. Let’s check whether controlling for age in our model changes the results we obtained
## We'll first get some statistics on the quality of our model with including age
## logistic requires categorical to be either yes or no (1 or 0)
model1 <- glm(as.factor(COVID) ~ Age, data = pDat, family = binomial)
summary(model1)##
## Call:
## glm(formula = as.factor(COVID) ~ Age, family = binomial, data = pDat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9352 0.5739 0.6543 0.7122 0.7818
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.08533 0.88314 2.361 0.0182 *
## Age -0.01187 0.01353 -0.877 0.3804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 128.29 on 125 degrees of freedom
## Residual deviance: 127.49 on 124 degrees of freedom
## AIC: 131.49
##
## Number of Fisher Scoring iterations: 4
## Here the summary shows that age does not seem to strongly correlate
## with COVID status, and so hence we would not expect a major change
## in our results on including it in our model. However, just to test
## that, let's add it to out model and check the resutls.
mm_age <- model.matrix(~COVID + Age, pDat)
efit_age <- lmFit(eNorm, mm_age) %>%
eBayes()
topTable(efit_age, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")## logFC AveExpr t P.Value adj.P.Val B
## GBGT1 -14.046860 13.119938 -8.550910 3.893914e-14 4.618182e-10 21.40403
## MCM6 12.757246 20.821294 8.034319 6.344663e-13 3.003183e-09 18.77807
## SLC12A9 -93.350898 100.648629 -8.000681 7.596584e-13 3.003183e-09 18.60857
## SLC15A3 -58.215586 68.814556 -7.902345 1.284422e-12 3.808311e-09 18.11420
## MAPK8IP3 -18.489458 29.398845 -7.844907 1.743990e-12 4.136744e-09 17.82627
## CDC7 3.324990 5.656738 7.608243 6.104652e-12 1.206686e-08 16.64678
## AIF1 -321.225164 472.519567 -7.472091 1.247953e-11 2.114389e-08 15.97360
## PITPNM1 -16.437456 24.699942 -7.376130 2.060166e-11 2.972291e-08 15.50168
## TTC9 -24.915019 19.432974 -7.350907 2.349422e-11 2.972291e-08 15.37799
## CLSPN 2.923208 4.133007 7.338497 2.506148e-11 2.972291e-08 15.31720
topTable(efit_age, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "logFC")## logFC AveExpr t P.Value adj.P.Val B
## S100A9 -10160.956 22372.842 -2.468919 1.491768e-02 3.595278e-02 -3.3784598
## HBA2 -4648.774 2670.516 -5.061858 1.465260e-06 3.794319e-05 5.0153718
## ACTB -4316.973 7242.785 -5.916899 3.003894e-08 2.698953e-06 8.6518054
## FTL -4089.673 6897.358 -5.436198 2.784468e-07 1.182517e-05 6.5659556
## RNA28SN1 -2764.601 2785.014 -5.723978 7.430527e-08 5.123607e-06 7.8026755
## IFITM2 -2700.867 5357.986 -3.528938 5.866230e-04 2.833951e-03 -0.5090658
## RNA28SN4 -2357.059 1437.686 -7.114010 8.004557e-11 6.328936e-08 14.2240520
## EEF1A1 -2349.765 8071.037 -4.066164 8.445041e-05 6.874275e-04 1.2600294
## FCGR3B 2193.313 6069.773 2.430801 1.650034e-02 3.894029e-02 -3.4649414
## HBA1 -2114.040 1667.291 -3.839525 1.957299e-04 1.271976e-03 0.4898601
## We see that when arranged by logFC and by adjusted pvalue our model with
## and without age shows the same ordering of the genes.We saw that lactate concentration was contributing to PC2. Let’s check if we should be adjusting for this variable.
mm_lactate <- model.matrix(~COVID + Lactate_mmol.l , pDat)
mm_lactate_df <- as.data.frame(mm_lactate)
lactate_logres <- glm(COVIDyes ~ Lactate_mmol.l, data = mm_lactate_df, family = binomial)
summary(lactate_logres)##
## Call:
## glm(formula = COVIDyes ~ Lactate_mmol.l, family = binomial, data = mm_lactate_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1839 0.4396 0.4876 0.6916 1.3768
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2880 0.3943 5.803 6.5e-09 ***
## Lactate_mmol.l -0.8370 0.2641 -3.170 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 128.29 on 125 degrees of freedom
## Residual deviance: 114.03 on 124 degrees of freedom
## AIC: 118.03
##
## Number of Fisher Scoring iterations: 5
## The summary shows that lactate indeed does seem to be significantly associated
## with COVID status. Let's visualise that
mm_lactate_df %>%
ggplot(aes(x = Lactate_mmol.l, y = COVIDyes)) +
geom_point(alpha = 0.2, colour = "orange") +
geom_smooth(method = "glm", method.args = list(family = "binomial"), colour = "orange") +
theme_minimal() +
labs(title = "Does lactate concentration inform of COVID status?", x = "Lactate (mmol/l)", y = "Probability of COVID-positive status")## `geom_smooth()` using formula 'y ~ x'
## so now we know that there is a significant association with lactate levels and
## the probability of having COVID. Let's add lactate to our linear model
efit_lactate <- lmFit(eNorm, mm_lactate) %>%
eBayes()
topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")## logFC AveExpr t P.Value adj.P.Val B
## GBGT1 -14.100701 13.119938 -7.922312 1.154622e-12 1.369382e-08 18.16473
## MCM6 13.263355 20.821294 7.753202 2.837841e-12 1.682840e-08 17.32187
## CDC7 3.529122 5.656738 7.600076 6.372660e-12 1.698344e-08 16.56355
## PELI3 -3.847807 4.537970 -7.598845 6.414097e-12 1.698344e-08 16.55747
## MAPK8IP3 -18.933196 29.398845 -7.577943 7.159967e-12 1.698344e-08 16.45435
## SLC15A3 -59.226784 68.814556 -7.483197 1.177384e-11 2.327296e-08 15.98809
## HAAO -3.877558 3.365732 -7.443598 1.448515e-11 2.454199e-08 15.79380
## NAPSA -29.225402 24.012864 -7.415859 1.674445e-11 2.482365e-08 15.65792
## METRNL -18.080742 15.634706 -7.384769 1.969362e-11 2.595182e-08 15.50584
## RNA18SN2 -1933.844371 1965.367916 -7.305644 2.972739e-11 2.938057e-08 15.11982
topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "logFC")## logFC AveExpr t P.Value adj.P.Val B
## HBA2 -5206.811 2670.516 -5.407114 3.175498e-07 1.226756e-05 6.440438
## FTL -4358.981 6897.358 -5.408454 3.156355e-07 1.226756e-05 6.446072
## ACTB -4327.541 7242.785 -5.524856 1.860821e-07 8.827734e-06 6.938547
## B2M 3371.789 16010.351 2.430849 1.649823e-02 3.649595e-02 -3.434133
## RNA28SN1 -3152.997 2785.014 -6.283311 5.161600e-09 8.745225e-07 10.288178
## EEF1A1 -2603.769 8071.037 -4.003939 1.067173e-04 7.237743e-04 1.061398
## TXNIP 2533.727 6352.420 2.621531 9.852509e-03 2.406813e-02 -2.990803
## FCGR3B 2494.685 6069.773 2.611941 1.011797e-02 2.458496e-02 -3.013814
## RNA28SN4 -2483.527 1437.686 -7.097047 8.733519e-11 6.092914e-08 14.109575
## HBA1 -2331.664 1667.291 -4.010926 1.039621e-04 7.102482e-04 1.085312
Let’s compare the expression of GBGT1 and HBA2 between COVID positive and negative patients.
pDat <- pDat %>%
rownames_to_column(var = "sample")
covid <- pDat %>%
dplyr::select(sample, COVID)
GBGT1 <- eNorm %>%
rownames_to_column(var = "gene") %>%
filter(gene == "GBGT1") %>%
column_to_rownames(var = "gene")
GBGT1 <- as.data.frame(t(GBGT1))
GBGT1 <- GBGT1 %>%
rownames_to_column(var = "sample")
GBGT1 <- GBGT1 %>%
left_join(covid, by = "sample")
GBGT1 %>%
ggplot(aes(x = COVID, y = log2(GBGT1), fill = COVID)) +
geom_violin() +
scale_fill_manual(values = c("gray", "orange")) +
theme_minimal() +
theme(legend.position = "bottom") +
labs(x = "COVID Status", y = "log2 (GBGT1 RPM)", title = "GBGT1: Gene with lowest adjusted p-value with or without accounting for lactate")HBA2 <- eNorm %>%
rownames_to_column(var = "gene") %>%
filter(gene == "HBA2") %>%
column_to_rownames(var = "gene")
HBA2 <- as.data.frame(t(HBA2))
HBA2 <- HBA2 %>%
rownames_to_column(var = "sample")
HBA2 <- HBA2 %>%
left_join(covid, by = "sample")
HBA2 %>%
ggplot(aes(x = COVID, y = log2(HBA2), fill = COVID)) +
geom_violin() +
geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.7, fill = "black") +
scale_fill_manual(values = c("gray", "orange")) +
theme_minimal() +
theme(legend.position = "bottom") +
labs(x = "COVID Status", y = "log2 (HBA2 RPM)", title = "HBA2: Gene with highest negative logFC change on including lactate concentration in the model")## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# HBA2 %>%
# ggplot(aes(x = sample, y = log2(HBA2), colour = COVID)) +
# geom_point() +
# scale_colour_manual(values = c("gray", "orange")) +
# theme_minimal() +
# theme(legend.position = "bottom") +
# labs(x = "COVID Status", y = "log2 (HBA2 RPM)", title = "HBA2: Gene with highest negative logFC change on including lactate concentration in the model") +
# facet_grid(~COVID)a popular way of combining the above information - p-values and FCs is a volcano plot. Let’s now select all of the DEG that we got when we included lactate measurement in out model to generate a volcano plot.
volcanoplot(efit_lactate, coef = "COVIDyes", highlight=5, style = "p-value", names = rownames(efit_lactate$coefficients), hl.col="blue")volcanoplot(efit_lactate, coef = "COVIDyes", highlight=5, style = "p-value", names = rownames(efit_lactate$coefficients), hl.col="blue", xlim=c(-500,500))a = topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.05, sort.by = "p")# BiocManager::install("biomaRt")
library(biomaRt)
listMarts() #gives us the list of databases available## Ensembl site unresponsive, trying useast mirror
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 108
## 2 ENSEMBL_MART_MOUSE Mouse strains 108
## 3 ENSEMBL_MART_SNP Ensembl Variation 108
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 108
ensembl=useMart("ensembl")
## which all species are present from the ensembl database?
head(listDatasets(ensembl))## dataset description
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1)
## 2 acalliptera_gene_ensembl Eastern happy genes (fAstCal1.2)
## 3 acarolinensis_gene_ensembl Green anole genes (AnoCar2.0v2)
## 4 acchrysaetos_gene_ensembl Golden eagle genes (bAquChr1.2)
## 5 acitrinellus_gene_ensembl Midas cichlid genes (Midas_v5)
## 6 amelanoleuca_gene_ensembl Giant panda genes (ASM200744v2)
## version
## 1 ASM259213v1
## 2 fAstCal1.2
## 3 AnoCar2.0v2
## 4 bAquChr1.2
## 5 Midas_v5
## 6 ASM200744v2
## we'll make a variable selecting the Homo sapiens dataset
mart = useMart("ensembl", dataset="hsapiens_gene_ensembl")## Ensembl site unresponsive, trying uswest mirror
## Using the DEGs we got from the lactate model
genes <- topTable(efit_lactate, coef = "COVIDyes", adjust.method = "fdr", p.value = 0.01, n = Inf, sort.by = "logFC")
genes <- rownames(genes)
## We'll only use the top 200 genes, as the maximum number of queries biomaRt can take is 500
genes <- genes[1:200]
## we require the Entrz IDs for all functions after this step - so converting HGNC Symbols to Entrez IDs
hgnc_to_entrez <- getBM(attributes = c("hgnc_symbol", "entrezgene_id"), filters = "hgnc_symbol", values = genes, mart = mart)
## check all choices
filters <- listFilters(mart) #filters are the parameters you search by
attr <- listAttributes(mart) #attributes are the matching parameters you're wanting to search for
head(hgnc_to_entrez)## hgnc_symbol entrezgene_id
## 1 ACTB 60
## 2 ACTG1 71
## 3 ADAM8 101
## 4 ADAR 103
## 5 AIF1 199
## 6 ALDOA 226
## selecting attributes as the GO id, the GO term, the GO term definition, and the cell
## compartment that GO term belongs to, searching by the filter/parameter HGNC symbol
go_terms <- getBM(attributes = c("hgnc_symbol", "go_id", "name_1006", "definition_1006", "namespace_1003"), filters = "hgnc_symbol", values = genes, mart = mart)
head(go_terms)## hgnc_symbol go_id name_1006
## 1 ACTB GO:0005634 nucleus
## 2 ACTB GO:0005737 cytoplasm
## 3 ACTB GO:0016020 membrane
## 4 ACTB GO:0032991 protein-containing complex
## 5 ACTB GO:0000166 nucleotide binding
## 6 ACTB GO:0005524 ATP binding
## definition_1006
## 1 A membrane-bounded organelle of eukaryotic cells in which chromosomes are housed and replicated. In most cells, the nucleus contains all of the cell's chromosomes except the organellar chromosomes, and is the site of RNA synthesis and processing. In some species, or in specialized cell types, RNA metabolism or DNA replication may be absent.
## 2 The contents of a cell excluding the plasma membrane and nucleus, but including other subcellular structures.
## 3 A lipid bilayer along with all the proteins and protein complexes embedded in it an attached to it.
## 4 A stable assembly of two or more macromolecules, i.e. proteins, nucleic acids, carbohydrates or lipids, in which at least one component is a protein and the constituent parts function together.
## 5 Binding to a nucleotide, any compound consisting of a nucleoside that is esterified with (ortho)phosphate or an oligophosphate at any hydroxyl group on the ribose or deoxyribose.
## 6 Binding to ATP, adenosine 5'-triphosphate, a universally important coenzyme and enzyme regulator.
## namespace_1003
## 1 cellular_component
## 2 cellular_component
## 3 cellular_component
## 4 cellular_component
## 5 molecular_function
## 6 molecular_function
## deleting all empty rows
go_terms <- go_terms %>%
mutate_all(na_if,"")
go_terms <- na.omit(go_terms)
## counting the frequency of each GO term
go_plot <- go_terms %>%
dplyr::count(name_1006) %>%
dplyr::arrange(desc(n))
## we know that the total DEGs we selected were 200, so let's get the
## percentage of how many of the genes were associated with a particular GO Term
head(go_plot)## name_1006 n
## 1 protein binding 169
## 2 cytosol 138
## 3 membrane 133
## 4 cytoplasm 131
## 5 extracellular exosome 93
## 6 nucleus 89
go_plot$total <- 200
go_plot <- go_plot[-1,]
go_plot <- go_plot %>%
mutate(perc = (n/total)*100) %>%
dplyr::arrange()
## for the first 20 GO Terms
go_plot[1:20,] %>%
ggplot(aes(x = reorder(name_1006, -perc), y = perc)) +
geom_bar(stat = "identity", width = 0.6) +
coord_cartesian(y = c(0,100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 20 GO Terms", subtitle = "For DEGs at adjpval <= 0.05", x = "GO Term", y = "Percentage of DEGs assoc. with GO Term")## let's all add the cellular compartment to our plot
component <- go_terms %>%
dplyr::select(name_1006, namespace_1003) %>%
distinct(name_1006, .keep_all = TRUE)
head(component)## name_1006 namespace_1003
## 1 nucleus cellular_component
## 2 cytoplasm cellular_component
## 3 membrane cellular_component
## 4 protein-containing complex cellular_component
## 5 nucleotide binding molecular_function
## 6 ATP binding molecular_function
go_plot <- go_plot %>%
right_join(component, by = "name_1006")
head(go_plot)## name_1006 n total perc namespace_1003
## 1 cytosol 138 200 69.0 cellular_component
## 2 membrane 133 200 66.5 cellular_component
## 3 cytoplasm 131 200 65.5 cellular_component
## 4 extracellular exosome 93 200 46.5 cellular_component
## 5 nucleus 89 200 44.5 cellular_component
## 6 RNA binding 88 200 44.0 molecular_function
go_plot[1:20,] %>%
ggplot(aes(x = reorder(name_1006, -perc), y = perc, fill = namespace_1003)) +
geom_bar(stat = "identity", width = 0.6) +
scale_fill_manual(values = c("maroon", "navy", "forest green")) +
coord_cartesian(y = c(0,100)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
scale_y_continuous(breaks = seq(0, 100, 10)) +
labs(title = "Top 20 GO Terms", subtitle = "For DEGs at adjpval <= 0.05", x = "GO Term", y = "Percentage of DEGs assoc. with GO Term")Let’s look at the KEGG pathways associated with our DEGs
# BiocManager::install("clusterProfiler")
library(clusterProfiler, quietly = TRUE)##
## clusterProfiler v4.4.4 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:purrr':
##
## simplify
## The following object is masked from 'package:stats':
##
## filter
k <- enrichKEGG(gene = genes, organism = "hsa")## Reading KEGG annotation online:
## Reading KEGG annotation online:
## --> No gene can be mapped....
## --> Expected input gene ID: 27349,23205,5373,80201,7366,729920
## --> return NULL...
## when we input our genes as HGNC IDs, the function doesn't work. We'll use our Entrez IDs that we have
head(hgnc_to_entrez)## hgnc_symbol entrezgene_id
## 1 ACTB 60
## 2 ACTG1 71
## 3 ADAM8 101
## 4 ADAR 103
## 5 AIF1 199
## 6 ALDOA 226
k <- enrichKEGG(gene = hgnc_to_entrez$entrezgene_id, organism = "hsa")
head(k)## ID Description GeneRatio BgRatio
## hsa03010 hsa03010 Ribosome 76/149 158/8191
## hsa05171 hsa05171 Coronavirus disease - COVID-19 78/149 232/8191
## hsa04145 hsa04145 Phagosome 11/149 152/8191
## hsa05140 hsa05140 Leishmaniasis 7/149 77/8191
## hsa05416 hsa05416 Viral myocarditis 6/149 60/8191
## hsa05152 hsa05152 Tuberculosis 10/149 180/8191
## pvalue p.adjust qvalue
## hsa03010 4.621372e-97 8.364683e-95 7.588779e-95
## hsa05171 1.147343e-84 1.038345e-82 9.420288e-83
## hsa04145 9.586089e-05 5.783607e-03 5.247122e-03
## hsa05140 4.747203e-04 2.148109e-02 1.948852e-02
## hsa05416 7.302291e-04 2.643429e-02 2.398226e-02
## hsa05152 1.599476e-03 4.825087e-02 4.377515e-02
## geneID
## hsa03010 2197/6134/4736/6135/6136/6137/23521/9045/6138/6139/6141/6142/6143/6144/6146/9349/6147/6152/6154/6155/6157/6158/6159/6122/6156/6160/6161/6164/11224/6165/6173/6167/6168/6169/6170/6124/6171/6125/6128/6129/6130/6132/6133/6175/6176/6181/6204/6205/6206/6208/6209/6210/6217/6218/6222/6223/6187/6224/6228/6229/6230/6231/6232/6233/6234/6235/6188/6189/6191/6193/6194/6201/6202/6203/3921/7311
## hsa05171 103/2197/6134/4736/6135/6136/6137/23521/9045/6138/6139/6141/6142/6143/6144/6146/9349/6147/6152/6154/6155/6157/6158/6159/6122/6156/6160/6161/6164/11224/6165/6173/6167/6168/6169/6170/6124/6171/6125/6128/6129/6130/6132/6133/6175/6176/6181/6204/6205/6206/6208/6209/6210/6217/6218/6222/6223/6187/6224/6228/6229/6230/6231/6232/6233/6234/6235/6188/6189/6191/6193/6194/6201/6202/6203/3921/6772/7311
## hsa04145 60/71/9114/11151/1535/3113/3115/3123/3689/10312/7846
## hsa05140 1535/1915/3113/3115/3123/3689/6772
## hsa05416 60/71/3113/3115/3123/3689
## hsa05152 9114/972/11151/3113/3115/3123/3689/4046/6772/10312
## Count
## hsa03010 76
## hsa05171 78
## hsa04145 11
## hsa05140 7
## hsa05416 6
## hsa05152 10
kegg_res = k@result